1 Setup ids
2 single item checking
3 testing angle deltas
4 testing cluster_image_name
5 Cluster random samples of obsids



In [ ]:

    
from planet4.dbscan import DBScanner

from planet4 import io, clustering, plotting, markings, dbscan
import seaborn as sns
sns.set_context('notebook')
blotchcols = markings.Blotch.to_average
fancols = markings.Fan.to_average



In [ ]:

    
# not automatically initialized
%matplotlib ipympl



In [ ]:

    
import socket
if socket.gethostname().startswith('macd2860'):
    %config InlineBackend.figure_format = 'retina'



In [ ]:

    
%config InlineBackend.figure_format = 'png'



In [ ]:

    
from nbtools.logging import setup_live_logging
import logging
logger = setup_live_logging('planet4.dbscan', logging.DEBUG)

Setup ids



In [ ]:

    
def get_gold_ids(person, root="/Users/klay6683/Dropbox/Documents/latex_docs/p4_paper1/gold_data"):
    """Read one gold-data ID list from disk.

    Parameters
    ----------
    person : {"GP", "MES", "KMA", "common_gold_data"}
        Stem of the text file to read, i.e. ``<root>/<person>.txt``.
    root : str or pathlib.Path, optional
        Folder that holds the gold-data text files. Defaults to the
        location that used to be hard-coded in this function.

    Returns
    -------
    pd.Series
        Content of the file, read without a header row and squeezed along
        the column axis (a single-column file therefore yields a Series).
    """
    # Imported locally because no visible cell of this notebook imports
    # `Path` or `pd` explicitly.
    from pathlib import Path
    import pandas as pd

    table = pd.read_csv(Path(root) / f"{person}.txt", header=None)
    # `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed
    # in 2.0; squeezing the column axis afterwards is the documented
    # replacement.
    return table.squeeze("columns")



In [ ]:

    
ids = get_gold_ids('common_gold_data')



In [ ]:

    
ids = 'br5 bu5 ek1 pbr 1dt 1dr 1fe dch bvc 1c5 1ab 1dk 18s 1b0 1cl 1ct 1at 1al 1aa 10p 185 139 13t 15k 17a'.split()



In [ ]:

    
def create_and_save_randoms(pool=None, size=100, fname='myids.npy', seed=None):
    """Draw a random sample of IDs and store it as a ``.npy`` file.

    Called without arguments this behaves as before: 100 entries are drawn
    from the notebook-level ``ids`` via ``np.random.choice`` (whose default
    is sampling with replacement) and written to 'myids.npy'.

    Parameters
    ----------
    pool : sequence, optional
        IDs to draw from. Defaults to the notebook-level ``ids``.
    size : int, optional
        Number of entries to draw. Default: 100
    fname : str, optional
        Target file handed to ``np.save``. Default: 'myids.npy'
    seed : int, optional
        When given, a dedicated ``np.random.RandomState(seed)`` is used so
        the draw can be reproduced. When None, numpy's global random state
        is used, as in the original version.

    Returns
    -------
    numpy.ndarray
        The sample that was written to ``fname``.
    """
    if pool is None:
        pool = ids  # notebook-level list of image ids
    chooser = np.random if seed is None else np.random.RandomState(seed)
    myids = chooser.choice(pool, size)
    np.save(fname, myids)
    return myids

myids = np.load('myids.npy')

len(myids)



In [ ]:

    
combined = list(ids) + list(myids)



In [ ]:

    
%store combined



In [ ]:

    
db = DBScanner(savedir='gold_with_angle_std', do_large_run=True)



In [ ]:

    
for id_ in ids:
    print(id_)
    db.cluster_image_id(id_)



In [ ]:

    
bucket = []
for img_id in ids:
    p4id = markings.ImageID(img_id, scope='planet4', data=db.data)
    db.pm.obsid = p4id.image_name
    db.pm.id = img_id
    try:
        bucket.extend(db.pm.fandf.angle_std.values)
    except FileNotFoundError:
        continue



In [ ]:

    
len(bucket)



In [ ]:

    
bucket = np.array(bucket)



In [ ]:

    
import seaborn as sns



In [ ]:

    
sns.set_context('paper')



In [ ]:

    
bins = np.arange(0, 22, 1)



In [ ]:

    
pd.Series(bucket).to_csv("angle_std_bucket.csv", index=False)



In [ ]:

    
fig, ax = plt.subplots(constrained_layout=True)
sns.distplot(bucket, kde=False, bins=bins)
ax.set_title("Histogram of angular STD for merged fan clusters")
ax.set_xlabel("Fan angle standard deviation per cluster [deg]")
ax.set_ylabel("Histogram Counts")



In [ ]:

    
db.pm.fanfile



In [ ]:

    
db.pm.fandf.angle_std



In [ ]:

    
np.save('combined_ids_to_check.npy', np.array(combined))



In [ ]:

    
from nbtools import execute_in_parallel



In [ ]:

    
def process_id(id_):
    """Run the 'large' parameter scan for one image id, fans first, then blotches.

    One ``DBScanner`` (configured with savedir='newest_clustering_review'
    and do_large_run=True) is created per call and reused for both marking
    kinds. Nothing is returned.

    The import lives inside the function body; presumably so that the
    function is self-contained when shipped to parallel workers - TODO confirm.

    Parameters
    ----------
    id_ : str
        Image id handed on to ``DBScanner.parameter_scan``.
    """
    from planet4.dbscan import DBScanner

    scanner = DBScanner(savedir='newest_clustering_review', do_large_run=True)
    for marking_kind in ('fan', 'blotch'):
        # settings are rebuilt per iteration so each call receives fresh lists
        scan_settings = {
            'msf_vals_to_scan': [0.1, 0.13],
            'eps_vals_to_scan': [20, 25, 30],
            'size_to_scan': 'large',
        }
        scanner.parameter_scan(id_, marking_kind, **scan_settings)

Here are my comments from the review:

APF0000br5 - seems like the big blotch should have been seen

APF0000bu5 - seems like the middle fan should be there - looks like too strict a cut rather than a clustering issue?

APF0000ek1 - yellow final blotch comes out of nowhere

APF0000pbr - bottom right blotch seems like it should have survived

APF00001dt - cyan fan seems bigger than it should be



In [ ]:

    
results = execute_in_parallel(process_id, combined)



In [ ]:

    
for id_ in ids:
    print(id_)
    for kind in ['blotch']:
        print(kind)
        dbscanner = DBScanner(savedir='do_cluster_on_large', do_large_run=True)
#         dbscanner.parameter_scan(kind, [0.1, 0.13], [30, 50, 70])
        # for blotch:
        dbscanner.cluster_and_plot(id_, kind, saveplot=True)
        plt.close('all')



In [ ]:

    
for id_ in ithaca_sample:
    print(id_)
    for kind in ['blotch']:
        print(kind)
        dbscanner = DBScanner(id_)
#         dbscanner.parameter_scan(kind, [0.1, 0.13], [30, 50, 70])
        # for blotch:
        dbscanner.parameter_scan(kind, [0.1, 0.13], [15, 22, 30])
        plt.close('all')



In [ ]:

    
for id_ in ithaca_sample:
    print(id_)
    for kind in ['fan']:
        print(kind)
        dbscanner = DBScanner(id_)
        dbscanner.parameter_scan(kind, [0.1, 0.13], [30, 50, 70])
        # for blotch:
#         dbscanner.parameter_scan(kind, [0.1, 0.13], [15, 22, 30])
        plt.close('all')



In [ ]:

    
from shapely.geometry import Point

p1 = Point(266.4, 470.56)
p2 = Point(262.072, 469.679)

p1.distance(p2)

single item checking



In [ ]:

    
%matplotlib ipympl



In [ ]:

    
from planet4.catalog_production import ReleaseManager



In [ ]:

    
rm = ReleaseManager('v1.0')
rm.savefolder



In [ ]:

    
db = DBScanner(savedir='examples_for_paper', do_large_run=True)



In [ ]:

    
db.eps_values



In [ ]:

    
db.cluster_and_plot('arp', 'fan')



In [ ]:

    
plotting.plot_image_id_pipeline('gr0', datapath='gold_per_obsid', via_obsid=True)



In [ ]:

    
plt.close('all')



In [ ]:

    
id_ = ids[14]



In [ ]:

    
db.parameter_scan(id_, 'fan', msf_vals_to_scan=(0.1, 0.13),
                  eps_vals_to_scan=(10, 20, 30), size_to_scan='small')



In [ ]:

    
plotting.plot_image_id_pipeline(id_, datapath=rm.savefolder, save=True, saveroot='./plots')



In [ ]:

    
data = io.DBManager().get_image_id_markings('arp')



In [ ]:

    
data.classification_id.nunique()



In [ ]:

    
data.groupby(['classification_id', 'user_name']).marking.value_counts()



In [ ]:

    
data[data.marking=='blotch'].shape



In [ ]:

    
db.parameter_scan('bsn', 'blotch', [0.10, 0.13], [10, 12, 14], size_to_scan='small', )



In [ ]:

    
v1 = (8.9, 87.3)
v2 = (19.8, 79.8)



In [ ]:

    
v1 = np.array(v1)
v2 = np.array(v2)



In [ ]:

    
from numpy.linalg import norm



In [ ]:

    
norm(v1 - v2)



In [ ]:

    
# `norm` measures a single array; a second positional argument is taken as
# `ord`, so the difference vector has to be formed before calling it.
norm(np.array(v1) - np.array(v2))



In [ ]:

    
db.save_results



In [ ]:

    
db.final_clusters['blotch']



In [ ]:

    
import seaborn as sns
sns.set_context('notebook')



In [ ]:

    
import itertools

palette = itertools.cycle(sns.color_palette('bright'))
fig, ax = plt.subplots()

for b in db.final_clusters['blotch'][1]:
    db.p4id.plot_blotches(data=b, user_color=next(palette), ax=ax)
    ax.set_title('second round')
fig.savefig('second_round.png', dpi=150)



In [ ]:

    
db.parameter_scan('1wg', 'fan', 
                  msf_vals_to_scan=[0.1, 0.13],
                  eps_vals_to_scan=[20, 25, 30],
                  size_to_scan='large')



In [ ]:

    
db.parameter_scan('15k', 'blotch', 
                  msf_vals_to_scan=[0.1, 0.13],
                  eps_vals_to_scan=[10, 12, 15],
                  size_to_scan='small')



In [ ]:

    
fig, ax = plt.subplots()
db.p4id.plot_blotches(ax=ax)
ax.set_title('input data')
fig.savefig('input_data.png', dpi=150)



In [ ]:

    
blotches = db.p4id.filter_data('blotch').dropna(how='all', axis=1)



In [ ]:

    
blotches['x y radius_1 radius_2 angle'.split()].sort_values(by='radius_1')



In [ ]:

    
fans = db.p4id.filter_data('fan')



In [ ]:

    
xyclusters = pd.concat(db.cluster_xy(blotches, 15)).dropna(how='all', axis=1)



In [ ]:

    
blotches.shape



In [ ]:

    
xyclusters.shape



In [ ]:

    
blotches[~blotches.isin(xyclusters).all(1)].shape



In [ ]:



In [ ]:



In [ ]:



In [ ]:

    
db.eps_values['blotch']['angle']= None



In [ ]:

    
db.eps_values['blotch']['angle']= 20



In [ ]:

    
db.eps_values['blotch']['radius']['small']=30



In [ ]:

    
db.eps_values



In [ ]:

    
db.parameter_scan('bp7', 'blotch', [0.1, 0.13], [15,22,30], 'small')



In [ ]:

    
db.cluster_image_id('bz7')



In [ ]:

    
db.cluster_and_plot('bz7', 'blotch')



In [ ]:

    
db.min_samples



In [ ]:

    
db.cluster_image_id('bb6')



In [ ]:

    
db.final_clusters['blotch'][0][4][markings.Blotch.to_average+['user_name']]



In [ ]:

    
db.final_clusters['blotch'][0][2][markings.Blotch.to_average+['user_name']]



In [ ]:



In [ ]:

    
%debug



In [ ]:

    
db.parameter_scan('blotch', [0.1, 0.13], [15, 22, 30])



In [ ]:

    
db.parameter_scan('fan', [0.1,0.15], [30, 50,70])



In [ ]:

    
db.pipeline(10, 3, 50)



In [ ]:

    
db.store_folder



In [ ]:

    
sizes = []
for _,b in blotches.iterrows():
    B = markings.Blotch(b, scope='planet4')
    sizes.append(B.area)



In [ ]:

    
%matplotlib nbagg



In [ ]:

    
plt.figure()
plt.hist(sizes, bins=50);



In [ ]:

    
db.parameter_scan('fan', [0.1,0.15], [10, 15, 20])



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

    
db.cluster_and_plot('blotch', 20, 3)
ax = plt.gca()
ax.get_title()



In [ ]:



In [ ]:

    
db.parameter_scan('fan', [0.07, 0.1, 0.15], [15,20])



In [ ]:

    
db.parameter_scan('blotch', [0.07, 0.1, 0.15], [15,20])



In [ ]:

    
ek1.cluster_and_plot('blotch', 20, 3)



In [ ]:

    
ek1.p4id.plot_blotches(data=ek1.finalclusters[5])



In [ ]:

    
ek1.p4id.plot_blotches(data=ek1.averaged[5])



In [ ]:



In [ ]:



In [ ]:

    
p4id = markings.ImageID('1fe', scope='planet4')
blotches = p4id.get_blotches()



In [ ]:

    
X = blotches['x y'.split()]



In [ ]:

    
dbscanner = DBScanner(X, min_samples=5, eps=20)



In [ ]:

    
clusters = [blotches.loc[idx] for idx in dbscanner.clustered_indices]



In [ ]:

    
from planet4.clustering import cluster_angles



In [ ]:

    
bucket = []
for cluster in clusters:
    print(cluster.shape)
    bucket.append([cluster.loc[idx] for idx in cluster_angles(cluster, 'blotch', 5)])



In [ ]:

    
for item in bucket:
    for subitem in item:
        print(subitem.shape)



In [ ]:

    
cluster_and_plot('1dr', production=True, dynamic=True,
                         msf=msf, eps=eps, radii=False, dbscan=True,
                         figtitle=figtitle)



In [ ]:



In [ ]:

    
cm = cluster_and_plot('1dt', production=False, msf=0.1, dynamic=True,
                      radii=False, dbscan=False)



In [ ]:

    
df = pd.read_csv('fuckdf.csv')



In [ ]:

    
(df - df.mean(axis=0))/df.std(axis=0)



In [ ]:

    
# Keep a row only if every column lies within one standard deviation of that
# column's mean; the mask has to be reduced across columns (axis=1) to be
# usable as a row selector.
df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 1).all(axis=1)]



In [ ]:

    
from scipy.stats import zscore



In [ ]:

    
zscore??



In [ ]:

    
pd.DataFrame(zscore(df,ddof=1))



In [ ]:

    
def highlight_bigger_std(x, n_std=2):
    """Style callback that marks outliers of a Series with a yellow background.

    An entry counts as an outlier when its absolute distance from the mean
    of ``x`` exceeds ``n_std`` times the standard deviation of ``x``
    (``Series.std``).

    Parameters
    ----------
    x : pd.Series
        Values to inspect, e.g. one column handed in by ``df.style.apply``.
    n_std : float, optional
        Outlier threshold in units of the standard deviation. Default: 2

    Returns
    -------
    list of str
        One CSS string per entry of ``x``: 'background-color: yellow' for
        outliers, '' otherwise.
    """
    is_outlier = (np.abs(x - x.mean()) / x.std() > n_std)
    return ['background-color: yellow' if flag else '' for flag in is_outlier]



In [ ]:

    
df.style.apply(highlight_bigger_std)



In [ ]:



In [ ]:



In [ ]:

    
cm = cluster_and_plot('pbr', production=False, msf=0.1, dynamic=True,
                      radii=False)



In [ ]:

    
cm = cluster_and_plot('pbr',eps=20, production=False, msf=0.1, dynamic=True,
                      radii=True)



In [ ]:

    
cm.db



In [ ]:

    
imgid = '1at'
imgid = 'dch'
imgid = 'bvc'
imgid = '1dr'
imgid = '1fe'
imgid = 'br5'
imgid = 'ek1'
p4id = markings.ImageID(imgid, scope='planet4')



In [ ]:

    
data = p4id.get_blotches()



In [ ]:

    
from planet4.dbscan import DBScanner



In [ ]:

    
current_X = data[['x','y']].values



In [ ]:

    
clusterer = DBScanner(current_X, eps=15, min_samples=3)



In [ ]:

    
clusterer.n_clusters_



In [ ]:

    
cluster = data.loc[clusterer.clustered_indices[0]]

p4id.plot_blotches(blotches=cluster,with_center=True)



In [ ]:

    
cluster[blotchcols]



In [ ]:

    
indices = clustering.cluster_angles(cluster, 'blotch', eps_blotchangle=10)
indices



In [ ]:

    
angle_cluster_data = cluster.loc[indices[0], blotchcols +['user_name']]



In [ ]:

    
angle_cluster_data



In [ ]:

    
df = angle_cluster_data[blotchcols]



In [ ]:

    
df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 1).all(axis=1)]



In [ ]:

    
clustering.get_average_object(angle_cluster_data[blotchcols], 'blotch')



In [ ]:

    
p4id.plot_blotches(blotches=cluster.loc[indices[0]], with_center=True)



In [ ]:

    
df = cluster.loc[indices[0]][blotchcols]



In [ ]:

    
df['area'] = df.apply(lambda x: np.pi*x.radius_1*x.radius_2, axis=1)



In [ ]:

    
df



In [ ]:

    
col='radius_1'



In [ ]:

    
df.radius_1.std()



In [ ]:

    
df[np.abs(df[col]-df[col].mean())<=(1*df[col].std())]



In [ ]:

    
df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 1).all(axis=1)]



In [ ]:

    
subclus



In [ ]:

    
testblotch = markings.Blotch?



In [ ]:

    
testblotchdata = dict(x=340, y=340, angle=127, radius_1=250, radius_2=186)



In [ ]:

    
testblotch = markings.Blotch(
    pd.DataFrame(
        testblotchdata, index=[0]), scope='planet4')
fig, ax = plt.subplots()
ax.add_artist(testblotch)
ax.set_xlim(0, 800)
ax.set_ylim(0, 600)



In [ ]:

    
testblotch = markings.Blotch(
    pd.DataFrame(testblotchdata, index=[0]),
    scope='planet4')

p4id.plot_blotches(blotches=[testblotch])



In [ ]:

    
from sklearn.cluster import DBSCAN


class DBScanner(object):
    """Run DBSCAN on a point set, plot the result and keep per-cluster masks.

    Instantiating the object immediately executes ``_run_DBSCAN()``, which
    performs the clustering and draws every cluster into the current
    matplotlib axes.

    NOTE(review): defining this class in the notebook shadows the
    ``DBScanner`` imported from ``planet4.dbscan`` for every cell that is
    executed afterwards.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data to be clustered. Converted with ``np.asarray`` so that the
        positional slicing used for plotting also works for DataFrame input.
    eps : float, optional
        Distance criterion for the DBSCAN algorithm. Samples further away
        than this value don't become members of the currently considered
        cluster. Default: 15
    min_samples : int, optional
        Minimum number of samples required for a cluster to be created.
        Default: 3
    only_core : bool, optional
        If True, the stored cluster masks contain only core samples.
        Default: False

    Attributes
    ----------
    db : sklearn.cluster.DBSCAN
        The fitted estimator.
    n_clusters_ : int
        Number of distinct labels, not counting the noise label -1.
    clustered_indices : list of boolean numpy.ndarray
        One membership mask over the rows of ``X`` per cluster, in order of
        ascending cluster label.
    n_rejected : int
        Number of samples labelled as noise (-1).
    """

    def __init__(self, X, eps=15, min_samples=3, only_core=False):
        self.X = np.asarray(X)
        self.eps = eps
        self.min_samples = min_samples
        self.only_core = only_core

        # this line executes the clustering
        self._run_DBSCAN()

    @staticmethod
    def _plot_members(xy, col, markersize):
        """Draw one group of samples as filled circles into the current axes."""
        if xy.shape[1] > 1:
            y = xy[:, 1]
        else:
            # single-feature data is drawn along a horizontal line at y=0
            y = [0] * xy.shape[0]
        plt.plot(
            xy[:, 0],
            y,
            'o',
            markerfacecolor=col,
            markeredgecolor='black',
            markersize=markersize)

    def _run_DBSCAN(self):
        """Perform the DBSCAN clustering, plot it and store the cluster masks."""
        # keyword arguments: scikit-learn no longer accepts these positionally
        db = DBSCAN(eps=self.eps, min_samples=self.min_samples).fit(self.X)
        labels = db.labels_
        core_samples_mask = np.zeros_like(labels, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True

        # sorted, so that clustered_indices[i] always belongs to label i
        unique_labels = sorted(set(labels))
        colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

        self.n_clusters_ = len(unique_labels) - (1 if -1 in labels else 0)

        self.clustered_indices = []  # one boolean membership mask per cluster
        self.n_rejected = 0
        for k, col in zip(unique_labels, colors):
            class_member_mask = (labels == k)

            if k == -1:
                # count the noise samples themselves; the length of the mask
                # would be the total number of samples
                self.n_rejected = int(class_member_mask.sum())
                continue

            if self.only_core:
                cluster_members = (class_member_mask & core_samples_mask)
            else:
                cluster_members = class_member_mask

            # cluster members with big markers, non-core members on top with small ones
            self._plot_members(self.X[cluster_members], col, markersize=14)
            self._plot_members(self.X[class_member_mask & ~core_samples_mask],
                               col, markersize=6)
            self.clustered_indices.append(cluster_members)
        plt.gca().invert_yaxis()
        plt.title('Estimated number of clusters: %d' % self.n_clusters_)
        self.db = db



In [ ]:

    
cluster[blotchcols]



In [ ]:

    
xy_angles = clustering.angle_to_xy(cluster.angle, 'blotch')



In [ ]:

    
xy_angles



In [ ]:

    
xy_angles.shape



In [ ]:

    
plt.figure(figsize=(5*1.3,5))
clusterer = DBScanner(xy_angles, eps=20*np.pi/360, min_samples=3)



In [ ]:

    
data.loc[clusterer.clustered_indices[1]]



In [ ]:

    
for cluster_members in clusterer.clustered_indices:
    clusterdata = data.loc[cluster_members, blotchcols + ['user_name']]
    print(len(clusterdata))
    angle_clustered = clustering.cluster_angles(clusterdata, 'blotch')
    for indices in angle_clustered:
        angle_clusterdata = clusterdata.loc[indices, blotchcols +
                                            ['user_name']]
        filtered = angle_clusterdata.groupby('user_name').first()
        print(len(filtered))



In [ ]:

    
cm.min_samples



In [ ]:

    
30* cm.min_samples_factor



In [ ]:

    
cm.reduced_data['blotch']



In [ ]:

    
cm.cluster_angles



In [ ]:

    
db = clustering.cluster_angles(cluster, 'blotch')
len(db[0])



In [ ]:

    
len(cluster)



In [ ]:



In [ ]:



In [ ]:



In [ ]:

    
filtered = cluster.groupby('user_name').first()



In [ ]:

    
plt.figure()
filtered.angle.hist()



In [ ]:



In [ ]:

    
toprint = cluster2[markings.Fan.to_average + ['user_name', 'marking', 'classification_id']]



In [ ]:

    
toprint.to_clipboard(index=False)



In [ ]:

    
def add_angle_vector(df):
    """Return a copy of ``df`` extended by the unit vector of its angles.

    Parameters
    ----------
    df : pd.DataFrame
        Needs an ``angle`` column, interpreted in degrees.

    Returns
    -------
    pd.DataFrame
        New frame with the additional columns ``xang`` (cosine) and
        ``yang`` (sine); the input frame is left untouched.
    """
    radians = np.deg2rad(df.angle)
    return df.assign(xang=np.cos(radians), yang=np.sin(radians))



In [ ]:

    
cluster2 = add_angle_vector(cluster2)



In [ ]:

    
cluster2

testing angle deltas



In [ ]:

    
def angle_to_xy(angle):
    """Map angles in degrees onto points of the unit circle.

    Parameters
    ----------
    angle : scalar or 1-d array-like
        Angle(s) in degrees.

    Returns
    -------
    numpy.ndarray
        Cosine and sine stacked row-wise and then transposed; for 1-d input
        of length n this is an (n, 2) array of (x, y) pairs.
    """
    radians = np.deg2rad(angle)
    stacked = np.vstack((np.cos(radians), np.sin(radians)))
    return stacked.T



In [ ]:

    
def cluster_angles(angles, delta_angle):
    """Cluster angles by running ``DBScanner`` on their unit-circle points.

    NOTE(review): this definition replaces the ``cluster_angles`` that an
    earlier cell imports from ``planet4.clustering``.

    Parameters
    ----------
    angles : array-like
        Angles in degrees, converted with ``angle_to_xy``.
    delta_angle : float
        Angular tolerance; multiplied by a fixed per-degree distance to
        obtain the ``eps`` handed to ``DBScanner``.

    Returns
    -------
    DBScanner
        The clusterer, created with ``min_samples=3``.
    """
    # numerically matches 2*sin(0.5 deg), the chord that one degree spans on
    # the unit circle
    chord_per_degree = 0.017453070996747883
    return DBScanner(angle_to_xy(angles),
                     eps=delta_angle * chord_per_degree,
                     min_samples=3)



In [ ]:

    
clusterer = cluster_angles(cluster.angle, 10)



In [ ]:

    
clusterer.db.core_sample_indices_



In [ ]:

    
clusterer.db.labels_



In [ ]:

    
cluster.shape



In [ ]:

    
clusterer.clustered_indices



In [ ]:

    
# the DBScanner defined in this notebook keeps its per-cluster masks in
# `clustered_indices`; it has no `clustered_data` attribute
cluster2.iloc[clusterer.clustered_indices[0]]



In [ ]:



In [ ]:

    
dbscanner.reduced_data[0]

This means all ellipses were clustered together. eps=10 picks 3 out of these 6.



In [ ]:

    
clusterdata = data.iloc[dbscanner.reduced_data[0]]

So clusterdata is just the same as the input data; I just repeat the exact same code steps here for consistency.



In [ ]:

    
clusterdata[blotchcols]



In [ ]:



In [ ]:

    
meandata = clusterdata.mean()
meandata



In [ ]:

    
from scipy.stats import circmean



In [ ]:

    
meandata.angle = circmean(clusterdata.angle, high=180)



In [ ]:

    
meandata



In [ ]:

    
n_class_old = data.classification_id.nunique()
n_class_old



In [ ]:

    
# number of classifications that include fan and blotches
f1 = data.marking == 'fan'
f2 = data.marking == 'blotch'
n_class_fb = data[f1 | f2].classification_id.nunique()
n_class_fb



In [ ]:

    
data=data[data.marking=='blotch']



In [ ]:

    
plotting.plot_raw_blotches('bvc')



In [ ]:

    
fans.plot(kind='scatter', x='x',y='y')
plt.gca().invert_yaxis()



In [ ]:

    
fx1 = data.x < 400 
fx2 = data.x > 300
fy1 = data.y_R > 300
fy2 = data.y_R < 400



In [ ]:

    
data = data.reset_index()



In [ ]:

    
data[fx1 & fx2 & fy1 & fy2].angle



In [ ]:

    
cm.dbscanner.reduced_data

testing cluster_image_name



In [ ]:

    
dbscanner = dbscan.DBScanner()



In [ ]:

    
db = io.DBManager()



In [ ]:

    
data = db.get_obsid_markings('ESP_020568_0950')



In [ ]:

    
image_ids = data.image_id.unique()



In [ ]:

    
%matplotlib nbagg
import seaborn as sns
sns.set_context('notebook')



In [ ]:

    
p4id = markings.ImageID(image_ids[0])
p4id.plot_fans()



In [ ]:

    
p4id.plot_fans(data=p4id.data.query('angle>180'))



In [ ]:

    
p4id.imgid



In [ ]:

    
data[data.marking=='fan'].angle.describe()



In [ ]:

    
dbscanner.cluster_image_name('PSP_002622_0945')



In [ ]:

    
db = io.DBManager()



In [ ]:

    
db.get_image_name_markings('PSP_002622_0945')

Cluster random samples of obsids



In [ ]:

    
obsids = 'ESP_020476_0950, ESP_011931_0945, ESP_012643_0945, ESP_020783_0950'.split(', ')



In [ ]:

    
obsids



In [ ]:

    
def process_obsid(obsid):
    """Cluster one observation and hand its id back to the caller.

    ``do_cluster_obsids`` is called with ``savedir`` set to the obsid
    itself. The import is done inside the function body; presumably so the
    function is self-contained for parallel workers - TODO confirm.

    Parameters
    ----------
    obsid : str
        Observation id to process.

    Returns
    -------
    str
        The unchanged ``obsid``.
    """
    import planet4.catalog_production as catalog_production

    catalog_production.do_cluster_obsids(obsid, savedir=obsid)
    return obsid



In [ ]:

    
from nbtools import execute_in_parallel



In [ ]:

    
execute_in_parallel(process_obsid, obsids)



In [ ]:

    
db = io.DBManager()

# upper bound for the number of image ids plotted per obsid
N_SAMPLES_PER_OBSID = 50

for obsid in obsids:
    data = db.get_image_name_markings(obsid)
    unique_ids = data.image_id.drop_duplicates()
    # `Series.sample` raises a ValueError when asked for more entries than
    # exist (sampling without replacement), so cap n at what is available
    image_ids = unique_ids.sample(n=min(N_SAMPLES_PER_OBSID, len(unique_ids)))
    for id_ in image_ids:
        print(id_)
        plotting.plot_image_id_pipeline(id_, datapath=obsid, save=True,
                                        saveroot=f'plots/{obsid}',
                                        via_obsid=True)
        # release the figures right away; many are created in this loop
        plt.close('all')



In [ ]:

    
plotting.plot_finals('prv', datapath=obsids[0], via_obsid=True)



In [ ]:

Table of Contents

Setup ids

single item checking

testing angle deltas

testing cluster_image_name

Cluster random samples of obsids